#' ---
#' title: Reproduce Figure 4 (Congressional Floor Speeches)
#' date: 2024-10-01
#' version: 1.0
#' ---

library(tidyverse)
library(tidytext)

## Load and Tidy Data ------------------

# Load the raw CSV; the code below uses its `virtues`, `speech_id`, and
# `party` columns.
d <- read_csv(file = 'application4-one-shot-gpt-3.csv')

# Reshape to long format: split the comma-separated `virtues` field so that
# each virtue label gets its own row in a new `virtue` column.
d2 <- unnest_tokens(d,
                    output = virtue,
                    input = virtues,
                    token = 'regex',
                    pattern = ',')

# Splitting on ',' leaves the space that followed each comma; strip it.
d2 <- mutate(d2, virtue = str_trim(virtue))


# Super-classes of virtues. Each class is defined by a regex of word stems
# that covers its cognates/synonyms. Order matters: classes are tried from
# top to bottom and the first one that matches a label wins.
virtue_stems <- c(
  'loyalty'        = 'loyal|duti|duty|steadfast|devot|allegi',
  'bravery'        = 'brave|fearless|hero|gallant|valiant|cour|valor',
  'patriotism'     = 'patrio',
  'hard work'      = 'hard work|industrious|assiduous|diligen',
  'fairness'       = 'equitab|equity|egalitar|equal|impartial|fair',
  'compassion'     = 'kind|empath|humanity|caring|compassio',
  'charity'        = 'philanthr|benevol|benefice|charit',
  'success'        = 'success|achieve|merit',
  'education'      = 'educat|mentor|knowl|intellig',
  'advocacy'       = 'advoca|activi',
  'faith'          = 'faith|belie',
  'sustainability' = 'enviro|sustain',
  'peace'          = 'peace',
  'sacrifice'      = 'sacrif|selfless'
)

# Assign every label the first class whose stem regex it matches.
# Labels that match no class (or are NA) are left as NA.
classify_virtue <- function(labels) {
  classes <- rep(NA_character_, length(labels))
  for (i in seq_along(virtue_stems)) {
    matched <- str_detect(labels, virtue_stems[[i]])
    # only fill labels that are still unclassified, so that earlier
    # classes keep priority over later ones
    fill <- is.na(classes) & !is.na(matched) & matched
    classes[fill] <- names(virtue_stems)[[i]]
  }
  classes
}

d2 <- d2 |>
  mutate(virtue_class = classify_virtue(virtue))

# Per-party denominator: number of rows of `d` for each party, stored
# directly under the name `num_speeches`.
# NOTE(review): this equals the number of speeches only if `d` has exactly
# one row per speech -- confirm against the input file.
speech_count <- count(d, party, name = 'num_speeches')

## Plot Differences Between Parties --------------------------

# Grouped bar chart: for each party, the percentage of its speeches that
# mention each virtue class at least once.
p <- d2 |>
  # one row per (speech, virtue class), so a speech that names several
  # labels of the same class is counted only once for that class
  distinct(speech_id, party, virtue_class) |>
  # drop labels that matched no class, and the classes left out of the figure
  filter(!is.na(virtue_class),
         !(virtue_class %in% c('faith', 'peace', 'sustainability'))) |>
  # count(party, ...) returns an ungrouped result, unlike group_by() + count()
  count(party, virtue_class) |>
  # explicit key: avoids the "Joining with `by = ...`" message and guards
  # against an accidental join on any other shared column
  left_join(speech_count, by = 'party') |>
  mutate(pct_speeches = round(n / num_speeches * 100, 1)) |>
  ggplot(mapping = aes(x = virtue_class, y = pct_speeches, group = party, fill = party)) +
  # A single bar layer. The original drew geom_bar(width = 0.5) and then a
  # wider geom_col on top of it, which hid the first layer entirely; only the
  # geom_col bars were visible, so that is the layer kept here.
  geom_col(position = position_dodge(width = 0.9)) +
  # Labels use the same dodge width as the bars so each number is centred
  # over its own bar (the original dodged the text by 1 and the bars by 0.9).
  geom_text(aes(label = pct_speeches),
            position = position_dodge(width = 0.9),
            size = 3,
            vjust = -1) +
  labs(x = "Virtue",
       y = "% of Speeches Mentioning Virtue",
       fill = 'Party') +
  scale_fill_grey() +
  theme_minimal()

# show the figure (auto-printed when the script is run line by line)
p

# write the figure to disk at 10 x 6 (in ggsave's default units)
ggsave('figure4.png', p, width = 10, height = 6)
